#!/bin/bash

base="https://visibleearth.nasa.gov"
url="$base/view_cat.php?categoryID=1484&order=asc&sequence=data&p="
# the bluemarble collection, sorted by data date, page to be added in loop
dl_path="bluemarble"

filetypes="" # by default, download everything
notothers=0 # by default, download everything
notmonths=0 # by default, download everything

sep="========================="

################################################################################
# FUNCTIONS
usage() {
bold="[1m"
red="[38;5;1m"
yellow="[38;5;3m"
reset="[m(B"
cat << EOF

This script will retrieve images from the

NASA Visible Earth - Blue Marble
================================

collection found at this address:
${url%%&*}

It will attempt to download ALL the image (and some animation) files found on
the sub-pages into a structure of subfolders under this base directory:
${bold}${yellow}$dl_path${reset}
This can be changed via the ${bold}${red}-d${reset} option:
${bold}${yellow}$0 ${red}-d ${yellow}"path/to/files"${reset} # relative or absolute

It is possible to restrict the filetypes downloaded by passing a list of
extensions separated by any non-alphanumeric character, e.g.:
${bold}${yellow}$0 ${red}-t ${yellow}jpg:tif:png${reset} # case insensitive

It is possible to NOT download the images for the 12 months (option ${bold}${red}-M${reset}) or
to NOT download any other files (option ${bold}${red}-O${reset}).

Downloading everything will take many hours even with a fast connection; it
resulted in an 84G download as of Feb 2019.
E.g. restricting the download to JPG images of just the 12 months:
${bold}${yellow}$0 ${red}-t ${yellow}jpg ${red}-M${reset}
resulted in a 14G download as of Feb 2019 and still took over an hour.

If you interrupt and re-run the script later with the same base directory, it
will not attempt to re-download already downloaded files.

EOF
}
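# strip_href: print the target between the quotes of an href attribute.
# E.g. (hypothetical input): strip_href 'href="/view.php?id=12345"'
# prints "/view.php?id=12345".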
strip_href() {
	local retval="${1#*\"}"
	echo "${retval%\"*}"
}
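# xpath: run an XPath query against HTML read from stdin.
# Usage: xpath "//a/@href" - <<<"$html"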
xpath() {
	xmllint --html --noent --nonet --xpath "$@" 2>/dev/null
	# xmllint creates loads of error messages even when it's working correctly
	# if you really want to see these messages, remove '2>/dev/null'
}
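# get_path: map a page title to the download subdirectory and print it.
# Month titles map to "MM/<variant>" (see get_sub_path); anything else is
# passed through sanitize. Returns 1 without output when the title belongs
# to a group excluded via -M or -O.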
get_path() {
	# try the 12 month names first; first match wins
	local months=(January February March April May June
		July August September October November December)
	local i
	for i in "${!months[@]}"; do
		if [[ "$1" == *"${months[i]}"* ]]; then
			[[ $notmonths == 1 ]] && return 1
			printf '%02d/%s\n' "$((i + 1))" "$(get_sub_path "$1")"
			return 0
		fi
	done
	# no month in the title: treat it as one of the "other" images
	[[ $notothers == 1 ]] && return 1
	sanitize "$1"
}
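# get_sub_path: pick the image-variant subfolder from keywords in the title.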
get_sub_path() {
	case "$1" in
	*"Topography and Bathymetry"*) echo "world.topo.bathy"
	;;
	*Topography*) echo "world.topo"
	;;
	*) echo "world"
	;;
	esac
}
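# sanitize: replace every character outside [a-zA-Z0-9-] with a dot, then
# strip the leading "NASA.Visible.Earth.." that results from typical titles.
# E.g. sanitize "NASA Visible Earth: Blue Marble" prints "Blue.Marble".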
sanitize() {
	local sanitized_url="${1//[^a-zA-Z0-9-]/.}"
	echo "${sanitized_url#NASA.Visible.Earth..}"
}

################################################################################
# MAIN
usage
printf "$sep\nUser choices:\n"
while getopts "t:d:OM" opt; do
  case $opt in
    t) filetypes="$OPTARG"
		echo "Filetypes chosen: $filetypes"
      ;;
    d) dl_path="$OPTARG"
		# only prefix $PWD when the given path is relative
		if [[ "$dl_path" == /* ]]; then echo "Download path chosen: $dl_path"
		else echo "Download path chosen: $PWD/$dl_path"; fi
      ;;
    M) notmonths=1
		echo "Not downloading the images for the 12 Months"
      ;;
    O) notothers=1
		echo "Not downloading any Other images"
      ;;
    *) usage; exit 1
      ;;
  esac
done
printf "$sep\n"
read -p "Press Enter to begin or Ctrl-c to abort... "

# the index of the collection spans several pages, currently three
# loop: keep appending the page number to $url...
for ((i=1;i<10;i++)); do
	printf "\n$sep$sep$sep\nPAGE $i: $url$i\n$sep$sep$sep\n"
	# from the grid of index thumbnails for subpages, extract the links
	oldifs="$IFS"
	IFS=$'\n'
	grid=(
	$(wget -qO - "$url$i" | xpath "//div[@class[contains(.,'main')]]/div[@class[contains(.,'col')]]/div[@class=\"caption\"]/a" -)
	)
	IFS="$oldifs"
	# ...until we don't get results anymore
	[[ "X${grid[@]}" == "X" ]] && break

	# for each image page link, get images
	for page in "${grid[@]}"; do
		title="$(xpath "//a/text()" - <<<"$page")"
		echo "Title: $title"
		img_page_link="$(xpath "//a/@href" - <<<"$page")"
		img_page_link="$(strip_href "$img_page_link")"

		path="$(get_path "$title")"
		# only continue if get_path returned a string (a subdirectory to create)
		if [[ "$path" != "" ]]; then
			printf "Downloadpath (subdirectory): $dl_path/$path\n$sep\n"
			img_page="$(wget -qO - "$base/${img_page_link}")"
			img_link=( $(xpath "//div[@id=\"visuals\"]/div/div/a/@href" - <<<"$img_page") )
			for dl in "${img_link[@]}"; do
				dl="$(strip_href "$dl")"
				if [[ "$filetypes" == "" ]] || [[ "${filetypes,,}" =~ (^|[^a-zA-Z0-9-])"${dl##*.}"($|[^a-zA-Z0-9-]) ]]; then
					if [ -r "$dl_path/$path/${dl##*/}" ]; then
						echo "${dl##*/} exists; skipping"
					else
						printf "File to download: ${dl##*/}\nwget says:\n"
						# finally, wget downloads the image into the subfolder calculated.
						# wget creates these folders as needed.
						wget -nc -nv -P "$dl_path/$path" "$dl"
						echo "$sep"
					fi
				fi
			done
		else printf "Nothing to do here.\n$sep\n"
		fi
	done
done

exit
